{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原\n",
    "# View dataset directory. This directory will be recovered automatically after resetting environment. \n",
    "!ls /home/aistudio/data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.\n",
    "# View personal work directory. All changes under this directory will be kept even after reset. Please clean unnecessary files in time to speed up environment loading.\n",
    "!ls /home/aistudio/work"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:\n",
    "# If a persistence installation is required, you need to use the persistence path as the following:\n",
    "!mkdir /home/aistudio/external-libraries\n",
    "!pip install beautifulsoup4 -t /home/aistudio/external-libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 同时添加如下代码, 这样每次环境(kernel)启动的时候只要运行下方代码即可:\r\n",
    "# Also add the following code, so that every time the environment (kernel) starts, just run the following code:\r\n",
    "import sys\r\n",
    "sys.path.append('/home/aistudio/external-libraries')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirror.baidu.com/pypi/simple/\n",
      "Collecting bs4\n",
      "  Downloading https://mirror.baidu.com/pypi/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz\n",
      "Collecting beautifulsoup4 (from bs4)\n",
      "\u001b[?25l  Downloading https://mirror.baidu.com/pypi/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)\n",
      "\u001b[K     |████████████████████████████████| 122kB 12.6MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting soupsieve>1.2 (from beautifulsoup4->bs4)\n",
      "  Downloading https://mirror.baidu.com/pypi/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl\n",
      "Building wheels for collected packages: bs4\n",
      "  Building wheel for bs4 (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for bs4: filename=bs4-0.0.1-cp37-none-any.whl size=1273 sha256=889d63c892f2c23c6a7bc2162fe9b2531253a0b638155db882b591cace1f564e\n",
      "  Stored in directory: /home/aistudio/.cache/pip/wheels/80/e7/37/62c60fe0c1017a55e897489ce5d5e850fa5610745d6352ed0c\n",
      "Successfully built bs4\n",
      "Installing collected packages: soupsieve, beautifulsoup4, bs4\n",
      "Successfully installed beautifulsoup4-4.9.1 bs4-0.0.1 soupsieve-2.0.1\n"
     ]
    }
   ],
   "source": [
    "!pip install bs4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.162277660187101"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "EPSION = 0.0000000001\r\n",
    "\r\n",
    "def sqrt(n):\r\n",
    "    low,high = 1,n/2+1\r\n",
    "    mid = (low+high) / 2\r\n",
    "    while high-low > EPSION:\r\n",
    "        if mid * mid > n:\r\n",
    "            high = mid\r\n",
    "        else:\r\n",
    "            low = mid\r\n",
    "        mid = (high+low) / 2\r\n",
    "    return mid\r\n",
    "sqrt(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/bs4/__init__.py:220: UserWarning: You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.\n",
      "  warnings.warn(\"You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.\")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "29         东风雪铁龙C4L转向柱松动影响安全                D337,I297,  2020-08-24   厂家受理  \r"
     ]
    }
   ],
   "source": [
    "# 使用request + BeautifulSoup提取12365auto投诉信息\r\n",
    "import requests\r\n",
    "from bs4 import BeautifulSoup\r\n",
    "import pandas as pd\r\n",
    "\r\n",
    "# # 请求URL\r\n",
    "# url = 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-1.shtml'\r\n",
    "\r\n",
    "# 根据request_url得到soup\r\n",
    "def get_page_content(request_url):\r\n",
    "    # 得到页面的内容\r\n",
    "    header={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}\r\n",
    "    html=requests.get(request_url,headers=header,timeout=10)\r\n",
    "    content = html.text\r\n",
    "    #print(content)\r\n",
    "\r\n",
    "    # 通过content创建BeautifulSoup对象\r\n",
    "    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')\r\n",
    "    return soup\r\n",
    "\r\n",
    "# 分析当前页面的投诉\r\n",
    "def analysis(soup):\r\n",
    "    # 找到完整的投诉信息框\r\n",
    "    temp = soup.find('div',class_=\"tslb_b\")\r\n",
    "    # 创建DataFrame\r\n",
    "    df = pd.DataFrame(columns = ['id', 'brand', 'car_model', 'type', 'desc', 'problem', 'datetime', 'status'])\r\n",
    "    tr_list = temp.find_all('tr')\r\n",
    "    for tr in tr_list:\r\n",
    "        temp = {}\r\n",
    "        td_list = tr.find_all('td')\r\n",
    "        if len(td_list) > 0:\r\n",
    "            id,brand,car_model,type,desc,problem,datetime,status = td_list[0].text,td_list[1].text,td_list[2].text,td_list[3].text,td_list[4].text,td_list[5].text,td_list[6].text,td_list[7].text\r\n",
    "            temp[\"id\"],temp[\"brand\"],temp[\"car_model\"],temp[\"type\"],temp[\"desc\"],temp[\"problem\"],temp[\"datetime\"],temp[\"status\"] = id,brand,car_model,type,desc,problem,datetime,status\r\n",
    "            df = df.append(temp,ignore_index=True)\r\n",
    "    return df\r\n",
    "\r\n",
    "page_num = 20\r\n",
    "base_url = 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-'\r\n",
    "\r\n",
    "result = pd.DataFrame(columns = ['id', 'brand', 'car_model', 'type', 'desc', 'problem', 'datetime', 'status'])\r\n",
    "for i in range(page_num):\r\n",
    "    request_url = base_url+str(i+1)+'.shtml'\r\n",
    "    soup = get_page_content(request_url)\r\n",
    "    print(request_url)\r\n",
    "    df = analysis(soup)\r\n",
    "    print(df)\r\n",
    "    result = result.append(df)\r\n",
    "    \r\n",
    "result.to_csv('car_complain.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>\n",
    "Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PaddlePaddle 1.8.0 (Python 3.5)",
   "language": "python",
   "name": "py35-paddle1.2.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
